Python wbdc data annalysis ensemble methods

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
pwd
Out[2]:
'D:\\python\\R and python practice\\Module 20\\Assignment\\Archive'
In [3]:
df = pd.read_csv("wbcd.csv")
In [4]:
df.head()
Out[4]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst points_worst symmetry_worst dimension_worst
0 87139402 B 12.32 12.39 78.85 464.1 0.10280 0.06981 0.03987 0.03700 ... 13.50 15.64 86.97 549.1 0.1385 0.1266 0.12420 0.09391 0.2827 0.06771
1 8910251 B 10.60 18.95 69.28 346.4 0.09688 0.11470 0.06387 0.02642 ... 11.88 22.94 78.28 424.8 0.1213 0.2515 0.19160 0.07926 0.2940 0.07587
2 905520 B 11.04 16.83 70.92 373.2 0.10770 0.07804 0.03046 0.02480 ... 12.41 26.44 79.93 471.4 0.1369 0.1482 0.10670 0.07431 0.2998 0.07881
3 868871 B 11.28 13.39 73.00 384.8 0.11640 0.11360 0.04635 0.04796 ... 11.92 15.77 76.53 434.0 0.1367 0.1822 0.08669 0.08611 0.2102 0.06784
4 9012568 B 15.19 13.21 97.65 711.8 0.07963 0.06934 0.03393 0.02657 ... 16.20 15.73 104.50 819.1 0.1126 0.1737 0.13620 0.08178 0.2487 0.06766

5 rows × 32 columns

In [5]:
df.keys()
Out[5]:
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'points_mean', 'symmetry_mean', 'dimension_mean', 'radius_se',
       'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'points_se', 'symmetry_se',
       'dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst',
       'area_worst', 'smoothness_worst', 'compactness_worst',
       'concavity_worst', 'points_worst', 'symmetry_worst', 'dimension_worst'],
      dtype='object')
In [7]:
df.describecribecribe()
Out[7]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst points_worst symmetry_worst dimension_worst
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 ... 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 ... 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 ... 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 ... 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 ... 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 ... 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 ... 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500

8 rows × 31 columns

In [6]:
sns.pairplot(df,hue='diagnosis',palette='Set1')
Out[6]:
<seaborn.axisgrid.PairGrid at 0x205daed1088>

Train test Split

In [9]:
X = df.drop('diagnosis',axis=1)
y = df['diagnosis']
In [10]:
from sklearn.model_selection import train_test_split
In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

Bagging Classifier

In [12]:
from sklearn.ensemble import BaggingClassifier # imported bagging classifier algorithm 
BaggingClassifier?
In [13]:
from sklearn.neighbors import KNeighborsClassifier
m = KNeighborsClassifier(n_neighbors=3)
In [14]:
# lets make a bagging classifier
bag = BaggingClassifier(
    m, 
    max_samples=.5, 
    max_features=2, 
    n_jobs=2,
    oob_score=True)
In [15]:
bag.fit(X_train, y_train) # 
Out[15]:
BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto',
                                                      leaf_size=30,
                                                      metric='minkowski',
                                                      metric_params=None,
                                                      n_jobs=None,
                                                      n_neighbors=3, p=2,
                                                      weights='uniform'),
                  bootstrap=True, bootstrap_features=False, max_features=2,
                  max_samples=0.5, n_estimators=10, n_jobs=2, oob_score=True,
                  random_state=None, verbose=0, warm_start=False)

Predictions and Evaluations of Bagging

In [16]:
pred = bag.predict(X_test)
In [17]:
from sklearn.metrics import classification_report,confusion_matrix
In [18]:
print(confusion_matrix(y_test,pred))
[[104   1]
 [  9  57]]
In [19]:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           B       0.92      0.99      0.95       105
           M       0.98      0.86      0.92        66

    accuracy                           0.94       171
   macro avg       0.95      0.93      0.94       171
weighted avg       0.94      0.94      0.94       171

In [20]:
bag.oob_score_
Out[20]:
0.9045226130653267
In [21]:
bag.score(X, y)
Out[21]:
0.9402460456942003

Descision Tree classifier

In [22]:
from sklearn.tree import DecisionTreeClassifier
DecisionTreeClassifier?
In [23]:
Ds = DecisionTreeClassifier()
In [24]:
Ds.fit(X,y)
Out[24]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

Predictions and Evaluations of Descision Tree classifer

In [25]:
pred = Ds.predict(X_test) 
In [26]:
print(confusion_matrix(y_test,pred))
[[105   0]
 [  0  66]]
In [27]:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           B       1.00      1.00      1.00       105
           M       1.00      1.00      1.00        66

    accuracy                           1.00       171
   macro avg       1.00      1.00      1.00       171
weighted avg       1.00      1.00      1.00       171

In [28]:
Ds.score(X,y)
Out[28]:
1.0

Random Forests

In [29]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier?
In [30]:
Rm = RandomForestClassifier(n_estimators=20, oob_score=True)
In [31]:
Rm.fit(X, y)
Out[31]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

Predictions and Evaluations of Random classifer

In [32]:
pred = Rm.predict(X_test) 
In [33]:
print(confusion_matrix(y_test,pred))
[[105   0]
 [  0  66]]
In [34]:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           B       1.00      1.00      1.00       105
           M       1.00      1.00      1.00        66

    accuracy                           1.00       171
   macro avg       1.00      1.00      1.00       171
weighted avg       1.00      1.00      1.00       171

In [35]:
Rm.score(X, y)
Out[35]:
0.9982425307557118

AdaBoost Classifier

In [37]:
from sklearn.ensemble import AdaBoostClassifier
AdaBoostClassifier?
In [38]:
Ad = AdaBoostClassifier(base_estimator=None, n_estimators=100)
In [39]:
Ad.fit(X, y)
Out[39]:
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=None)

Predictions and Evaluations of AdABoost classifer

In [40]:
pred = Ad.predict(X_test) 
In [41]:
print(confusion_matrix(y_test,pred))
[[105   0]
 [  0  66]]
In [42]:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           B       1.00      1.00      1.00       105
           M       1.00      1.00      1.00        66

    accuracy                           1.00       171
   macro avg       1.00      1.00      1.00       171
weighted avg       1.00      1.00      1.00       171

In [43]:
Ad.score(X, y)
Out[43]:
1.0

Gradient Tree Boosting

In [44]:
from sklearn.ensemble import GradientBoostingClassifier
GradientBoostingClassifier?
In [45]:
Gb = GradientBoostingClassifier(n_estimators=10)

Gb.fit(X, y)

Gb.score(X, y)
Out[45]:
0.9824253075571178

Predictions and Evaluations of Gradient Tree Boosting

In [46]:
pred = Gb.predict(X_test) 
In [47]:
print(confusion_matrix(y_test,pred))
[[105   0]
 [  1  65]]
In [48]:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           B       0.99      1.00      1.00       105
           M       1.00      0.98      0.99        66

    accuracy                           0.99       171
   macro avg       1.00      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171

In [49]:
Gb.score(X, y)
Out[49]:
0.9824253075571178

Voting Classifier

In [50]:
from sklearn.ensemble import VotingClassifier
VotingClassifier?
In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier


Vm = VotingClassifier(
    estimators=[('lr', LogisticRegression()), 
                ('rf', RandomForestClassifier()), 
                ('gnb', GaussianNB())], voting='hard')
In [52]:
Vm.fit(X, y)
Out[52]:
VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     cr...
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                                     n_jobs=None,
                                                     oob_score=False,
                                                     random_state=None,
                                                     verbose=0,
                                                     warm_start=False)),
                             ('gnb',
                              GaussianNB(priors=None, var_smoothing=1e-09))],
                 flatten_transform=True, n_jobs=None, voting='hard',
                 weights=None)
In [53]:
Vm.score(X, y)
Out[53]:
0.6362038664323374
In [ ]: